library(tidyverse)
library(corrplot)
library(leaps)
library(performance)
library(MASS)
library(caret)
library(sjPlot)
After importing the csv file containing the County Demographic Information (CDI) data, we notice that crimes, physicians, and hospital beds are given as numbers, while other info are given as proportions. We therefore compute the number of crimes, physicians, and hospital beds per 1000 people.
cdi_data = read_csv("./data/cdi.csv") %>%
janitor::clean_names() %>%
mutate(
cty_state = str_c(cty,",",state),
docs_rate_1000 = 1000 * docs/pop,
# Compute number of doctors/hospital beds per 1000 people.
beds_rate_1000 = 1000 * beds/pop,
density = as.numeric(pop)/as.numeric(area),
crime_rate_1000 = 1000 * crimes/pop) %>%
# Compute number of crimes per 1000 people.
dplyr::select(-docs,-beds,-crimes) %>%
relocate(id,cty_state,cty)
#knitr::kable(head(cdi_data))
We then take a closer look of each variables, calculate the pairwise correlations between variables, and list all the correlations between the crime rate (our interest) and all other variables.
cdi_data_exp = cdi_data %>%
dplyr::select(-id,-cty,-state, -cty_state)
par(mfrow=c(4,3))
boxplot(cdi_data_exp$area,main="Area")
boxplot(cdi_data_exp$pop,main="Population")
boxplot(cdi_data_exp$pop18,main="Population 18-34")
boxplot(cdi_data_exp$pop65,main="Population 65+")
boxplot(cdi_data_exp$hsgrad,main="Highschool grads")
boxplot(cdi_data_exp$bagrad,main="Bachelor's grads")
#par(mfrow=c(2,3))
boxplot(cdi_data_exp$poverty,main="Poverty Rate")
boxplot(cdi_data_exp$unemp,main="Unemployment Rate")
boxplot(cdi_data_exp$pcincome,main="Income Per Capita")
boxplot(cdi_data_exp$totalinc,main="Income Total")
boxplot(cdi_data_exp$docs_rate_1000,main="Active Physicians")
boxplot(cdi_data_exp$beds_rate_1000,main="Hospital Beds")
boxplot of continuous variables distribution
par(mfrow=c(1,1))
ggplot(cdi_data,aes(region)) +
geom_histogram(binwidth = 0.5) +
theme_classic() +
xlab("Region")+
ylab("Count") +
labs(title = "Histogram: Counts of four regions")
Histogram of catagorical variable:region distribution
boxplot(cdi_data_exp$crime_rate_1000,main="Boxplot of Crime Rate",horizontal = TRUE)
boxplot of dependent variable: crime rate
pairs(cdi_data_exp)
Pairwise plot
# correlation plot
cdi_data_cor = cor(cdi_data_exp)
corrplot(cdi_data_cor, type = "upper", diag = FALSE)
Correlation heatmap
crime_1000_cor = data.frame(cdi_data_cor) %>%
dplyr::select("Crime Rate (Per 1000)" = crime_rate_1000) %>%
t()
#knitr::kable(crime_1000_cor,digits = 2)
cdi_data = cdi_data %>%
dplyr::select(-id,-cty_state, -cty,-state) %>%
mutate(region = factor(region))
set.seed(1)
dt = sort(sample(nrow(cdi_data), nrow(cdi_data)*.9))
train_data = cdi_data[dt,]
test_data = cdi_data[-dt,]
# Remove high leverage points
cdi_data_clean = train_data[train_data$area >= quantile(train_data$area,0.002) & train_data$area <= quantile(train_data$area,0.998),]
cdi_data_clean = cdi_data_clean[cdi_data_clean$pop >= quantile(cdi_data_clean$pop,0.002) & cdi_data_clean$pop <= quantile(cdi_data_clean$pop,0.998),]
cdi_data_clean = cdi_data_clean[cdi_data_clean$pop18 >= quantile(cdi_data_clean$pop18,0.002) & cdi_data_clean$pop18 <= quantile(cdi_data_clean$pop18,0.998),]
cdi_data_clean = cdi_data_clean[cdi_data_clean$pop65 >= quantile(cdi_data_clean$pop65,0.002) & cdi_data_clean$pop65 <= quantile(cdi_data_clean$pop65,0.998),]
cdi_data_clean = cdi_data_clean[cdi_data_clean$hsgrad >= quantile(cdi_data_clean$hsgrad,0.002) & cdi_data_clean$hsgrad <= quantile(cdi_data_clean$hsgrad,0.998),]
cdi_data_clean = cdi_data_clean[cdi_data_clean$bagrad >= quantile(cdi_data_clean$bagrad,0.002) & cdi_data_clean$bagrad <= quantile(cdi_data_clean$bagrad,0.998),]
cdi_data_clean = cdi_data_clean[cdi_data_clean$poverty >= quantile(cdi_data_clean$poverty,0.002) & cdi_data_clean$poverty <= quantile(cdi_data_clean$poverty,0.998),]
cdi_data_clean = cdi_data_clean[cdi_data_clean$unemp >= quantile(cdi_data_clean$unemp,0.002) & cdi_data_clean$unemp <= quantile(cdi_data_clean$unemp,0.998),]
cdi_data_clean = cdi_data_clean[cdi_data_clean$pcincome >= quantile(cdi_data_clean$pcincome,0.002) & cdi_data_clean$pcincome <= quantile(cdi_data_clean$pcincome,0.998),]
cdi_data_clean = cdi_data_clean[cdi_data_clean$totalinc >= quantile(cdi_data_clean$totalinc,0.002) & cdi_data_clean$totalinc <= quantile(cdi_data_clean$totalinc,0.998),]
cdi_data_clean = cdi_data_clean[cdi_data_clean$docs_rate_1000 >= quantile(cdi_data_clean$docs_rate_1000,0.002) & cdi_data_clean$docs_rate_1000 <= quantile(cdi_data_clean$docs_rate_1000,0.998),]
cdi_data_clean = cdi_data_clean[cdi_data_clean$beds_rate_1000 >= quantile(cdi_data_clean$beds_rate_1000,0.002) & cdi_data_clean$beds_rate_1000 <= quantile(cdi_data_clean$beds_rate_1000,0.998),]
cdi_data_clean = cdi_data_clean[cdi_data_clean$beds_rate_1000 >= quantile(cdi_data_clean$beds_rate_1000,0.002) & cdi_data_clean$beds_rate_1000 <= quantile(cdi_data_clean$beds_rate_1000,0.998),]
cdi_data_clean = cdi_data_clean[cdi_data_clean$density >= quantile(cdi_data_clean$density,0.002) & cdi_data_clean$density <= quantile(cdi_data_clean$density,0.998),]
cdi_data_clean = cdi_data_clean[cdi_data_clean$crime_rate_1000 >= quantile(cdi_data_clean$crime_rate_1000,0.002) & cdi_data_clean$beds_rate_1000 <= quantile(cdi_data_clean$crime_rate_1000,0.998),]
par(mfrow=c(4,3))
boxplot(cdi_data_clean$area,main="Area")
boxplot(cdi_data_clean$pop,main="Population")
boxplot(cdi_data_clean$pop18,main="Population 18-34")
boxplot(cdi_data_clean$pop65,main="Population 65+")
boxplot(cdi_data_clean$hsgrad,main="Highschool grads")
boxplot(cdi_data_clean$bagrad,main="Bachelor's grads")
boxplot(cdi_data_clean$poverty,main="Poverty Rate")
boxplot(cdi_data_clean$unemp,main="Unemployment Rate")
boxplot(cdi_data_clean$pcincome,main="Income Per Capita")
boxplot(cdi_data_clean$totalinc,main="Income Total")
boxplot(cdi_data_clean$docs_rate_1000,main="Active Physicians")
boxplot(cdi_data_clean$beds_rate_1000,main="Hospital Beds")
Boxplot of each continuous variables aftern cleaning outliers
Data used for building model:
cdi_model = cdi_data_clean
full.fit = lm(crime_rate_1000 ~ ., data = cdi_model)
summary(full.fit) %>%
broom::tidy() %>%
mutate(p_rank = rank(p.value))
## # A tibble: 17 × 6
## term estimate std.error statistic p.value p_rank
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -108. 29.0 -3.71 2.40e- 4 8
## 2 area -0.000699 0.000955 -0.732 4.65e- 1 16
## 3 pop 0.0000806 0.0000136 5.95 6.57e- 9 3
## 4 pop18 1.28 0.371 3.45 6.29e- 4 9
## 5 pop65 -0.0161 0.324 -0.0497 9.60e- 1 17
## 6 hsgrad 0.349 0.279 1.25 2.12e- 1 13
## 7 bagrad -0.694 0.330 -2.10 3.63e- 2 11
## 8 poverty 1.88 0.432 4.34 1.86e- 5 6
## 9 unemp 0.885 0.548 1.61 1.07e- 1 12
## 10 pcincome 0.00325 0.000620 5.25 2.63e- 7 4
## 11 totalinc -0.00341 0.000658 -5.19 3.52e- 7 5
## 12 region2 11.2 2.77 4.04 6.67e- 5 7
## 13 region3 29.8 2.70 11.1 1.47e-24 1
## 14 region4 24.3 3.60 6.75 6.31e-11 2
## 15 docs_rate_1000 1.30 1.26 1.03 3.03e- 1 14
## 16 beds_rate_1000 2.54 0.901 2.82 5.13e- 3 10
## 17 density 0.000701 0.000738 0.950 3.43e- 1 15
backward = step(full.fit, direction='backward') %>% broom::tidy() %>% rename(backward = "term")
## Start: AIC=2062.47
## crime_rate_1000 ~ area + pop + pop18 + pop65 + hsgrad + bagrad +
## poverty + unemp + pcincome + totalinc + region + docs_rate_1000 +
## beds_rate_1000 + density
##
## Df Sum of Sq RSS AIC
## - pop65 1 1 92276 2060.5
## - area 1 141 92416 2061.0
## - density 1 238 92513 2061.4
## - docs_rate_1000 1 280 92555 2061.6
## - hsgrad 1 411 92687 2062.1
## <none> 92275 2062.5
## - unemp 1 687 92963 2063.2
## - bagrad 1 1164 93439 2065.1
## - beds_rate_1000 1 2092 94367 2068.7
## - pop18 1 3138 95413 2072.7
## - poverty 1 4967 97242 2079.7
## - totalinc 1 7109 99384 2087.7
## - pcincome 1 7269 99545 2088.3
## - pop 1 9328 101603 2095.8
## - region 3 35147 127422 2174.9
##
## Step: AIC=2060.47
## crime_rate_1000 ~ area + pop + pop18 + hsgrad + bagrad + poverty +
## unemp + pcincome + totalinc + region + docs_rate_1000 + beds_rate_1000 +
## density
##
## Df Sum of Sq RSS AIC
## - area 1 144 92420 2059.1
## - density 1 238 92513 2059.4
## - docs_rate_1000 1 279 92555 2059.6
## - hsgrad 1 413 92689 2060.1
## <none> 92276 2060.5
## - unemp 1 698 92974 2061.2
## - bagrad 1 1163 93439 2063.1
## - beds_rate_1000 1 2217 94493 2067.2
## - pop18 1 4127 96403 2074.5
## - poverty 1 5160 97436 2078.4
## - totalinc 1 7152 99428 2085.9
## - pcincome 1 7324 99600 2086.5
## - pop 1 9371 101646 2094.0
## - region 3 35176 127451 2173.0
##
## Step: AIC=2059.05
## crime_rate_1000 ~ pop + pop18 + hsgrad + bagrad + poverty + unemp +
## pcincome + totalinc + region + docs_rate_1000 + beds_rate_1000 +
## density
##
## Df Sum of Sq RSS AIC
## - docs_rate_1000 1 282 92702 2058.2
## - density 1 397 92817 2058.6
## - hsgrad 1 474 92894 2058.9
## <none> 92420 2059.1
## - unemp 1 626 93046 2059.5
## - bagrad 1 1195 93615 2061.8
## - beds_rate_1000 1 2220 94640 2065.8
## - pop18 1 4080 96500 2072.9
## - poverty 1 5090 97510 2076.7
## - totalinc 1 7017 99437 2083.9
## - pcincome 1 7263 99683 2084.8
## - pop 1 9229 101649 2092.0
## - region 3 35031 127452 2171.0
##
## Step: AIC=2058.16
## crime_rate_1000 ~ pop + pop18 + hsgrad + bagrad + poverty + unemp +
## pcincome + totalinc + region + beds_rate_1000 + density
##
## Df Sum of Sq RSS AIC
## - hsgrad 1 394 93095 2057.7
## <none> 92702 2058.2
## - density 1 580 93282 2058.4
## - unemp 1 637 93339 2058.7
## - bagrad 1 951 93653 2059.9
## - pop18 1 4264 96966 2072.7
## - poverty 1 5090 97792 2075.8
## - beds_rate_1000 1 5318 98020 2076.6
## - totalinc 1 6972 99674 2082.8
## - pcincome 1 7672 100373 2085.3
## - pop 1 9205 101907 2090.9
## - region 3 35510 128212 2171.2
##
## Step: AIC=2057.72
## crime_rate_1000 ~ pop + pop18 + bagrad + poverty + unemp + pcincome +
## totalinc + region + beds_rate_1000 + density
##
## Df Sum of Sq RSS AIC
## - density 1 435 93531 2057.4
## <none> 93095 2057.7
## - unemp 1 522 93618 2057.8
## - bagrad 1 561 93656 2057.9
## - pop18 1 3989 97084 2071.1
## - poverty 1 4766 97861 2074.0
## - beds_rate_1000 1 5331 98426 2076.2
## - totalinc 1 7222 100317 2083.1
## - pcincome 1 7324 100420 2083.5
## - pop 1 9501 102597 2091.4
## - region 3 35119 128215 2169.2
##
## Step: AIC=2057.43
## crime_rate_1000 ~ pop + pop18 + bagrad + poverty + unemp + pcincome +
## totalinc + region + beds_rate_1000
##
## Df Sum of Sq RSS AIC
## - unemp 1 504 94035 2057.4
## <none> 93531 2057.4
## - bagrad 1 709 94239 2058.2
## - pop18 1 4686 98217 2073.4
## - poverty 1 5568 99099 2076.7
## - beds_rate_1000 1 5625 99156 2076.9
## - totalinc 1 7389 100919 2083.3
## - pcincome 1 8910 102440 2088.8
## - pop 1 9968 103498 2092.6
## - region 3 34810 128341 2167.6
##
## Step: AIC=2057.4
## crime_rate_1000 ~ pop + pop18 + bagrad + poverty + pcincome +
## totalinc + region + beds_rate_1000
##
## Df Sum of Sq RSS AIC
## <none> 94035 2057.4
## - bagrad 1 1394 95428 2060.8
## - pop18 1 4681 98715 2073.2
## - beds_rate_1000 1 5122 99156 2074.9
## - totalinc 1 7551 101586 2083.8
## - poverty 1 8455 102489 2087.0
## - pcincome 1 10060 104095 2092.7
## - pop 1 10133 104167 2093.0
## - region 3 35812 129846 2169.8
both = step(full.fit, direction = "both") %>% broom::tidy() %>% rename(stepwise = "term")
## Start: AIC=2062.47
## crime_rate_1000 ~ area + pop + pop18 + pop65 + hsgrad + bagrad +
## poverty + unemp + pcincome + totalinc + region + docs_rate_1000 +
## beds_rate_1000 + density
##
## Df Sum of Sq RSS AIC
## - pop65 1 1 92276 2060.5
## - area 1 141 92416 2061.0
## - density 1 238 92513 2061.4
## - docs_rate_1000 1 280 92555 2061.6
## - hsgrad 1 411 92687 2062.1
## <none> 92275 2062.5
## - unemp 1 687 92963 2063.2
## - bagrad 1 1164 93439 2065.1
## - beds_rate_1000 1 2092 94367 2068.7
## - pop18 1 3138 95413 2072.7
## - poverty 1 4967 97242 2079.7
## - totalinc 1 7109 99384 2087.7
## - pcincome 1 7269 99545 2088.3
## - pop 1 9328 101603 2095.8
## - region 3 35147 127422 2174.9
##
## Step: AIC=2060.47
## crime_rate_1000 ~ area + pop + pop18 + hsgrad + bagrad + poverty +
## unemp + pcincome + totalinc + region + docs_rate_1000 + beds_rate_1000 +
## density
##
## Df Sum of Sq RSS AIC
## - area 1 144 92420 2059.1
## - density 1 238 92513 2059.4
## - docs_rate_1000 1 279 92555 2059.6
## - hsgrad 1 413 92689 2060.1
## <none> 92276 2060.5
## - unemp 1 698 92974 2061.2
## + pop65 1 1 92275 2062.5
## - bagrad 1 1163 93439 2063.1
## - beds_rate_1000 1 2217 94493 2067.2
## - pop18 1 4127 96403 2074.5
## - poverty 1 5160 97436 2078.4
## - totalinc 1 7152 99428 2085.9
## - pcincome 1 7324 99600 2086.5
## - pop 1 9371 101646 2094.0
## - region 3 35176 127451 2173.0
##
## Step: AIC=2059.05
## crime_rate_1000 ~ pop + pop18 + hsgrad + bagrad + poverty + unemp +
## pcincome + totalinc + region + docs_rate_1000 + beds_rate_1000 +
## density
##
## Df Sum of Sq RSS AIC
## - docs_rate_1000 1 282 92702 2058.2
## - density 1 397 92817 2058.6
## - hsgrad 1 474 92894 2058.9
## <none> 92420 2059.1
## - unemp 1 626 93046 2059.5
## + area 1 144 92276 2060.5
## + pop65 1 4 92416 2061.0
## - bagrad 1 1195 93615 2061.8
## - beds_rate_1000 1 2220 94640 2065.8
## - pop18 1 4080 96500 2072.9
## - poverty 1 5090 97510 2076.7
## - totalinc 1 7017 99437 2083.9
## - pcincome 1 7263 99683 2084.8
## - pop 1 9229 101649 2092.0
## - region 3 35031 127452 2171.0
##
## Step: AIC=2058.16
## crime_rate_1000 ~ pop + pop18 + hsgrad + bagrad + poverty + unemp +
## pcincome + totalinc + region + beds_rate_1000 + density
##
## Df Sum of Sq RSS AIC
## - hsgrad 1 394 93095 2057.7
## <none> 92702 2058.2
## - density 1 580 93282 2058.4
## - unemp 1 637 93339 2058.7
## + docs_rate_1000 1 282 92420 2059.1
## + area 1 147 92555 2059.6
## - bagrad 1 951 93653 2059.9
## + pop65 1 1 92701 2060.2
## - pop18 1 4264 96966 2072.7
## - poverty 1 5090 97792 2075.8
## - beds_rate_1000 1 5318 98020 2076.6
## - totalinc 1 6972 99674 2082.8
## - pcincome 1 7672 100373 2085.3
## - pop 1 9205 101907 2090.9
## - region 3 35510 128212 2171.2
##
## Step: AIC=2057.72
## crime_rate_1000 ~ pop + pop18 + bagrad + poverty + unemp + pcincome +
## totalinc + region + beds_rate_1000 + density
##
## Df Sum of Sq RSS AIC
## - density 1 435 93531 2057.4
## <none> 93095 2057.7
## - unemp 1 522 93618 2057.8
## - bagrad 1 561 93656 2057.9
## + hsgrad 1 394 92702 2058.2
## + area 1 202 92894 2058.9
## + docs_rate_1000 1 201 92894 2058.9
## + pop65 1 4 93091 2059.7
## - pop18 1 3989 97084 2071.1
## - poverty 1 4766 97861 2074.0
## - beds_rate_1000 1 5331 98426 2076.2
## - totalinc 1 7222 100317 2083.1
## - pcincome 1 7324 100420 2083.5
## - pop 1 9501 102597 2091.4
## - region 3 35119 128215 2169.2
##
## Step: AIC=2057.43
## crime_rate_1000 ~ pop + pop18 + bagrad + poverty + unemp + pcincome +
## totalinc + region + beds_rate_1000
##
## Df Sum of Sq RSS AIC
## - unemp 1 504 94035 2057.4
## <none> 93531 2057.4
## + density 1 435 93095 2057.7
## + area 1 388 93143 2057.9
## + docs_rate_1000 1 351 93180 2058.1
## - bagrad 1 709 94239 2058.2
## + hsgrad 1 249 93282 2058.4
## + pop65 1 0 93530 2059.4
## - pop18 1 4686 98217 2073.4
## - poverty 1 5568 99099 2076.7
## - beds_rate_1000 1 5625 99156 2076.9
## - totalinc 1 7389 100919 2083.3
## - pcincome 1 8910 102440 2088.8
## - pop 1 9968 103498 2092.6
## - region 3 34810 128341 2167.6
##
## Step: AIC=2057.4
## crime_rate_1000 ~ pop + pop18 + bagrad + poverty + pcincome +
## totalinc + region + beds_rate_1000
##
## Df Sum of Sq RSS AIC
## <none> 94035 2057.4
## + unemp 1 504 93531 2057.4
## + density 1 417 93618 2057.8
## + docs_rate_1000 1 371 93664 2057.9
## + area 1 260 93774 2058.4
## + hsgrad 1 165 93870 2058.8
## + pop65 1 12 94023 2059.4
## - bagrad 1 1394 95428 2060.8
## - pop18 1 4681 98715 2073.2
## - beds_rate_1000 1 5122 99156 2074.9
## - totalinc 1 7551 101586 2083.8
## - poverty 1 8455 102489 2087.0
## - pcincome 1 10060 104095 2092.7
## - pop 1 10133 104167 2093.0
## - region 3 35812 129846 2169.8
Variables chosen from stepwise regression:
bind_cols(backward[-1,1],both[-1,1]) %>% knitr::kable(caption = "Vairable selected from stepwise regression")
| backward | stepwise |
|---|---|
| pop | pop |
| pop18 | pop18 |
| bagrad | bagrad |
| poverty | poverty |
| pcincome | pcincome |
| totalinc | totalinc |
| region2 | region2 |
| region3 | region3 |
| region4 | region4 |
| beds_rate_1000 | beds_rate_1000 |
sb = regsubsets(crime_rate_1000 ~ ., data = cdi_model, nvmax = 14)
sumsb = summary(sb) # pop pop18 hsgrad bagrad poverty pcincome totalinc region beds_rate_1000 density
coef(sb, id = 12)
## (Intercept) pop pop18 bagrad poverty
## -8.317643e+01 8.025821e-05 1.249522e+00 -3.679141e-01 1.664217e+00
## unemp pcincome totalinc region2 region3
## 7.482408e-01 3.197803e-03 -3.406110e-03 1.203348e+01 2.967462e+01
## region4 beds_rate_1000 density
## 2.462527e+01 3.087835e+00 8.669407e-04
par(mfrow=c(1,2))
plot(2:15, sumsb$cp, xlab="No. of parameters", ylab="Cp Statistic")
abline(0,1)
plot(2:15, sumsb$adjr2, xlab="No of parameters", ylab="Adj R2")
Subset selection for best parameter numbers
According to the output, we determine that the number of variables should be above 12 because \(C_p \leq p\). Based on this analysis, we find that unemp and density could also be selected.
We need to remove totalinc, because it can be replaced. totalinc = pcincome * pop.
fit_nest = lm(crime_rate_1000 ~
pop + pop18 + bagrad +
poverty + unemp + pcincome + pcincome*pop + region +
beds_rate_1000 + density, data = cdi_model)
summary(fit_nest)
##
## Call:
## lm(formula = crime_rate_1000 ~ pop + pop18 + bagrad + poverty +
## unemp + pcincome + pcincome * pop + region + beds_rate_1000 +
## density, data = cdi_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -43.123 -9.614 -1.011 8.440 57.956
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.318e+01 1.549e+01 -5.371 1.42e-07 ***
## pop 8.026e-05 1.335e-05 6.011 4.59e-09 ***
## pop18 1.250e+00 3.208e-01 3.894 0.000118 ***
## bagrad -3.679e-01 2.519e-01 -1.461 0.145035
## poverty 1.664e+00 3.909e-01 4.257 2.66e-05 ***
## unemp 7.482e-01 5.310e-01 1.409 0.159717
## pcincome 3.198e-03 6.059e-04 5.277 2.29e-07 ***
## region2 1.203e+01 2.618e+00 4.597 5.98e-06 ***
## region3 2.967e+01 2.682e+00 11.065 < 2e-16 ***
## region4 2.463e+01 3.128e+00 7.873 4.27e-14 ***
## beds_rate_1000 3.088e+00 6.859e-01 4.502 9.14e-06 ***
## density 8.670e-04 6.738e-04 1.287 0.199066
## pop:pcincome -3.406e-09 6.500e-10 -5.240 2.76e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.22 on 354 degrees of freedom
## Multiple R-squared: 0.563, Adjusted R-squared: 0.5482
## F-statistic: 38.01 on 12 and 354 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(fit_nest)
Diagnose plots of model without interaction terms
boxcox(fit_nest)
Boxcox plot of model without interaction terms
The peak of boxcox plot is close to 0.5~1. Try \(\sqrt{y}\) transformation
cdi_model_trans = cdi_model %>%
mutate(
y_sqrt = sqrt(crime_rate_1000)
)
fit_nest_trans = lm(y_sqrt ~
pop + pop18 + bagrad +
poverty + unemp + pcincome + pcincome*pop + region +
beds_rate_1000 + density, data = cdi_model_trans)
summary(fit_nest_trans)
##
## Call:
## lm(formula = y_sqrt ~ pop + pop18 + bagrad + poverty + unemp +
## pcincome + pcincome * pop + region + beds_rate_1000 + density,
## data = cdi_model_trans)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0165 -0.6220 -0.0030 0.6178 3.4444
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.813e+00 1.060e+00 -1.710 0.088124 .
## pop 5.046e-06 9.142e-07 5.520 6.56e-08 ***
## pop18 8.223e-02 2.197e-02 3.744 0.000212 ***
## bagrad -2.693e-02 1.725e-02 -1.562 0.119298
## poverty 9.395e-02 2.677e-02 3.510 0.000506 ***
## unemp 5.976e-02 3.636e-02 1.644 0.101141
## pcincome 2.049e-04 4.149e-05 4.938 1.22e-06 ***
## region2 8.599e-01 1.792e-01 4.798 2.37e-06 ***
## region3 2.098e+00 1.836e-01 11.427 < 2e-16 ***
## region4 1.863e+00 2.141e-01 8.700 < 2e-16 ***
## beds_rate_1000 2.227e-01 4.696e-02 4.742 3.07e-06 ***
## density 5.455e-05 4.613e-05 1.182 0.237840
## pop:pcincome -2.115e-10 4.450e-11 -4.753 2.93e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.11 on 354 degrees of freedom
## Multiple R-squared: 0.5515, Adjusted R-squared: 0.5363
## F-statistic: 36.28 on 12 and 354 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(fit_nest_trans)
Diagnose plots of model without interaction terms
Compare to the diagnose plots of untransformed model, we found that the residuals are more unevenly distributed. Therefore, transformed model is worse. We select the untransformed model.
Our first model: \[crime\_rate\_1000 = pop + pop18 + bagrad + poverty + unemp \\ + pcincome + pcincome*pop + region beds\_rate\_1000 + density\]
According to Census Bureau, the number of persons below the official government poverty level was 33.6 million in 1990, representing 13.5 percent of the Nation’s population. Thus, we can use this criteria to divide poverty into two category: higher than national poverty rate and lower than national poverty rate.
poverty_status = cdi_model %>%
mutate(national_poverty = if_else(poverty > 13.5, "higher", "lower"))
ggplot(poverty_status, aes(x = pcincome, y = crime_rate_1000, color = national_poverty)) +
geom_point(alpha = .5) +
geom_smooth(method = "lm", se = F, aes(group = national_poverty, color = national_poverty)) +
labs(
title = "Crime Rate and Per Capita Income by Poverty Status",
x = "Per Capita Income",
y = "Crime Rate ",
color = "Comparison with national avergae"
)
Interaction plot of Income Per Capita and Poverty
fit_int1 = lm(crime_rate_1000 ~
pop + pop18 + bagrad +
poverty + unemp + pcincome + pcincome*pop + region +
beds_rate_1000 + density +
poverty*pcincome, data = cdi_model)
summary(fit_int1) %>% broom::tidy()
## # A tibble: 14 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -4.88e+1 1.75e+ 1 -2.80 5.46e- 3
## 2 pop 6.17e-5 1.39e- 5 4.44 1.19e- 5
## 3 pop18 1.14e+0 3.16e- 1 3.60 3.57e- 4
## 4 bagrad -2.62e-1 2.48e- 1 -1.05 2.93e- 1
## 5 poverty -2.54e+0 1.12e+ 0 -2.26 2.46e- 2
## 6 unemp 7.42e-1 5.20e- 1 1.43 1.55e- 1
## 7 pcincome 1.42e-3 7.43e- 4 1.91 5.72e- 2
## 8 region2 1.07e+1 2.59e+ 0 4.15 4.20e- 5
## 9 region3 2.80e+1 2.66e+ 0 10.5 1.19e-22
## 10 region4 2.22e+1 3.13e+ 0 7.08 7.60e-12
## 11 beds_rate_1000 2.01e+0 7.25e- 1 2.77 5.91e- 3
## 12 density 2.01e-4 6.81e- 4 0.295 7.68e- 1
## 13 pop:pcincome -2.56e-9 6.71e-10 -3.82 1.59e- 4
## 14 poverty:pcincome 2.80e-4 7.05e- 5 3.98 8.52e- 5
check_collinearity(fit_int1)
## # Check for Multicollinearity
##
## Low Correlation
##
## Term VIF Increased SE Tolerance
## pop 1.00 1.00 1.00
## pop18 2.10 1.45 0.48
## bagrad 2.61 1.62 0.38
## poverty 1.18 1.09 0.85
## unemp 1.69 1.30 0.59
## pcincome 1.12 1.06 0.89
## region 1.59 1.26 0.63
## beds_rate_1000 1.38 1.18 0.72
## density 1.01 1.01 0.99
## pop:pcincome 1.00 1.00 1.00
## poverty:pcincome 1.00 1.00 1.00
We notice that density, bagrad are not significant
# remove density
fit_int1 = lm(crime_rate_1000 ~
pop + pop18 + bagrad +
poverty + unemp + pcincome + pcincome*pop + region +
beds_rate_1000 +
poverty*pcincome, data = cdi_model)
summary(fit_int1)
##
## Call:
## lm(formula = crime_rate_1000 ~ pop + pop18 + bagrad + poverty +
## unemp + pcincome + pcincome * pop + region + beds_rate_1000 +
## poverty * pcincome, data = cdi_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -44.560 -8.893 -0.811 8.674 64.139
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.943e+01 1.731e+01 -2.855 0.004553 **
## pop 6.171e-05 1.387e-05 4.448 1.16e-05 ***
## pop18 1.153e+00 3.109e-01 3.708 0.000242 ***
## bagrad -2.687e-01 2.467e-01 -1.089 0.276790
## poverty -2.594e+00 1.107e+00 -2.342 0.019719 *
## unemp 7.393e-01 5.195e-01 1.423 0.155595
## pcincome 1.431e-03 7.414e-04 1.930 0.054403 .
## region2 1.067e+01 2.575e+00 4.143 4.29e-05 ***
## region3 2.790e+01 2.647e+00 10.539 < 2e-16 ***
## region4 2.207e+01 3.110e+00 7.096 7.03e-12 ***
## beds_rate_1000 2.004e+00 7.238e-01 2.768 0.005932 **
## pop:pcincome -2.555e-09 6.699e-10 -3.814 0.000161 ***
## poverty:pcincome 2.856e-04 6.829e-05 4.182 3.65e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.87 on 354 degrees of freedom
## Multiple R-squared: 0.5816, Adjusted R-squared: 0.5674
## F-statistic: 41.01 on 12 and 354 DF, p-value: < 2.2e-16
check_collinearity(fit_int1)
## # Check for Multicollinearity
##
## Low Correlation
##
## Term VIF Increased SE Tolerance
## pop 1.00 1.00 1.00
## pop18 2.08 1.44 0.48
## bagrad 2.59 1.61 0.39
## poverty 1.18 1.09 0.85
## unemp 1.69 1.30 0.59
## pcincome 1.13 1.06 0.89
## region 1.59 1.26 0.63
## beds_rate_1000 1.39 1.18 0.72
## pop:pcincome 1.00 1.00 1.00
## poverty:pcincome 1.00 1.00 1.00
# remove bagrad
fit_int1 = lm(crime_rate_1000 ~
pop + pop18 +
poverty + unemp + pcincome + pcincome*pop + region +
beds_rate_1000 +
poverty*pcincome, data = cdi_model)
summary(fit_int1)
##
## Call:
## lm(formula = crime_rate_1000 ~ pop + pop18 + poverty + unemp +
## pcincome + pcincome * pop + region + beds_rate_1000 + poverty *
## pcincome, data = cdi_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.933 -8.984 -0.825 9.062 64.974
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.143e+01 1.568e+01 -2.642 0.008603 **
## pop 6.001e-05 1.379e-05 4.352 1.76e-05 ***
## pop18 9.277e-01 2.321e-01 3.996 7.84e-05 ***
## poverty -2.777e+00 1.095e+00 -2.536 0.011647 *
## unemp 9.432e-01 4.847e-01 1.946 0.052468 .
## pcincome 9.798e-04 6.151e-04 1.593 0.112068
## region2 1.079e+01 2.573e+00 4.194 3.47e-05 ***
## region3 2.769e+01 2.641e+00 10.485 < 2e-16 ***
## region4 2.132e+01 3.033e+00 7.028 1.08e-11 ***
## beds_rate_1000 1.988e+00 7.238e-01 2.746 0.006339 **
## pop:pcincome -2.457e-09 6.640e-10 -3.700 0.000249 ***
## poverty:pcincome 2.957e-04 6.766e-05 4.371 1.63e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.87 on 355 degrees of freedom
## Multiple R-squared: 0.5802, Adjusted R-squared: 0.5672
## F-statistic: 44.61 on 11 and 355 DF, p-value: < 2.2e-16
check_collinearity(fit_int1)
## # Check for Multicollinearity
##
## Low Correlation
##
## Term VIF Increased SE Tolerance
## pop 1.00 1.00 1.00
## pop18 1.16 1.08 0.86
## poverty 1.17 1.08 0.86
## unemp 1.47 1.21 0.68
## pcincome 1.09 1.04 0.92
## region 1.47 1.21 0.68
## beds_rate_1000 1.39 1.18 0.72
## pop:pcincome 1.00 1.00 1.00
## poverty:pcincome 1.00 1.00 1.00
par(mfrow = c(2,2))
plot(fit_int1)
Diagnose plots with interaction terms:poverty*pcincome
boxcox(fit_int1)
Boxcox plot with interaction terms:poverty*pcincome
The peak of boxcox plot is close to 0.5~1. Try \(\sqrt{y}\) transformation
cdi_model_trans = cdi_model %>%
mutate(
y_sqrt = sqrt(crime_rate_1000)
)
fit_int1_trans = lm(y_sqrt ~
pop + pop18 +
poverty + unemp + pcincome + pcincome*pop + region +
beds_rate_1000 +
poverty*pcincome, data = cdi_model_trans)
summary(fit_int1_trans)
##
## Call:
## lm(formula = y_sqrt ~ pop + pop18 + poverty + unemp + pcincome +
## pcincome * pop + region + beds_rate_1000 + poverty * pcincome,
## data = cdi_model_trans)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9383 -0.5569 0.0166 0.6563 3.7968
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.271e-01 1.078e+00 0.860 0.390234
## pop 3.748e-06 9.476e-07 3.955 9.24e-05 ***
## pop18 5.881e-02 1.596e-02 3.686 0.000264 ***
## poverty -1.882e-01 7.525e-02 -2.500 0.012857 *
## unemp 7.489e-02 3.331e-02 2.248 0.025191 *
## pcincome 5.891e-05 4.228e-05 1.394 0.164333
## region2 7.833e-01 1.768e-01 4.430 1.26e-05 ***
## region3 1.970e+00 1.815e-01 10.855 < 2e-16 ***
## region4 1.644e+00 2.085e-01 7.888 3.84e-14 ***
## beds_rate_1000 1.532e-01 4.975e-02 3.079 0.002236 **
## pop:pcincome -1.504e-10 4.563e-11 -3.295 0.001084 **
## poverty:pcincome 1.876e-05 4.650e-06 4.034 6.71e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.091 on 355 degrees of freedom
## Multiple R-squared: 0.5659, Adjusted R-squared: 0.5524
## F-statistic: 42.07 on 11 and 355 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(fit_int1_trans)
Diagnose plots with interaction terms:poverty*pcincome
Compare to the diagnose plots of untransformed model, we found that the residuals are more unevenly distributed. Therefore, transformed model is worse. We select the untransformed model.
Our second model: \[crime\_rate\_1000 = pop + pop18 + poverty + unemp + pcincome + \\ pcincome*pop + region + beds\_rate\_1000 + poverty*pcincome\]
According to Census Bureau, national percent of persons 25 years old or older with bachelor’s degrees is 20.8%. Thus, we can use this criteria to divide bagrad into two category: higher than national bagrad and lower than national bargrad.
bagrad_status = cdi_model %>%
mutate(national_bagrad = if_else(bagrad > 20.8, "higher", "lower"))
ggplot(bagrad_status, aes(x = pcincome, y = crime_rate_1000, color = national_bagrad)) +
geom_point(alpha = .5) +
geom_smooth(method = "lm", se = F, aes(group = national_bagrad, color = national_bagrad)) +
ylim(0,150) +
labs(
title = "Crime Rate and Per Capita Income by Percent Bachelor's Degrees Status",
x = "Per Capita Income",
y = "Crime Rate",
color = "Comparison with national avergae"
)
Interaction plot of Income Per Capita and Bachelor’s Degree Status
fit_int2 = lm(crime_rate_1000 ~
pop + pop18 + bagrad +
poverty + unemp + pcincome + pcincome*pop + region +
beds_rate_1000 + density +
pcincome*bagrad, data = cdi_model)
summary(fit_int2)
##
## Call:
## lm(formula = crime_rate_1000 ~ pop + pop18 + bagrad + poverty +
## unemp + pcincome + pcincome * pop + region + beds_rate_1000 +
## density + pcincome * bagrad, data = cdi_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -42.88 -9.84 -0.85 8.19 61.74
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.141e+02 1.963e+01 -5.814 1.37e-08 ***
## pop 6.518e-05 1.453e-05 4.487 9.79e-06 ***
## pop18 1.048e+00 3.282e-01 3.193 0.001537 **
## bagrad 1.252e+00 6.865e-01 1.824 0.069046 .
## poverty 1.906e+00 3.995e-01 4.770 2.69e-06 ***
## unemp 9.002e-01 5.304e-01 1.697 0.090555 .
## pcincome 5.012e-03 9.351e-04 5.360 1.51e-07 ***
## region2 1.244e+01 2.603e+00 4.778 2.60e-06 ***
## region3 2.979e+01 2.662e+00 11.191 < 2e-16 ***
## region4 2.432e+01 3.107e+00 7.827 5.87e-14 ***
## beds_rate_1000 2.740e+00 6.944e-01 3.946 9.58e-05 ***
## density 1.007e-03 6.710e-04 1.500 0.134491
## pop:pcincome -2.706e-09 7.018e-10 -3.856 0.000137 ***
## bagrad:pcincome -8.009e-05 3.161e-05 -2.534 0.011723 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.09 on 353 degrees of freedom
## Multiple R-squared: 0.5708, Adjusted R-squared: 0.555
## F-statistic: 36.11 on 13 and 353 DF, p-value: < 2.2e-16
check_collinearity(fit_int2)
## # Check for Multicollinearity
##
## Low Correlation
##
## Term VIF Increased SE Tolerance
## pop 1.00 1.00 1.00
## pop18 1.38 1.17 0.72
## bagrad 1.63 1.27 0.62
## poverty 1.96 1.40 0.51
## unemp 1.46 1.21 0.69
## pcincome 1.14 1.07 0.87
## region 1.51 1.23 0.66
## beds_rate_1000 1.82 1.35 0.55
## density 1.02 1.01 0.98
## pop:pcincome 1.00 1.00 1.00
## bagrad:pcincome 1.00 1.00 1.00
par(mfrow = c(2,2))
plot(fit_int2)
Diagnose plots with interaction terms:pcincome*bagrad
boxcox(fit_int2)
Boxcox plot with interaction terms:pcincome*bagrad
The peak of boxcox plot is close to 0.5~1. Try \(\sqrt{y}\) transformation
fit_int2_trans = lm(y_sqrt ~
pop + pop18 + bagrad +
poverty + unemp + pcincome + pcincome*pop + region +
beds_rate_1000 + density +
pcincome*bagrad, data = cdi_model_trans)
summary(fit_int2_trans)
##
## Call:
## lm(formula = y_sqrt ~ pop + pop18 + bagrad + poverty + unemp +
## pcincome + pcincome * pop + region + beds_rate_1000 + density +
## pcincome * bagrad, data = cdi_model_trans)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9297 -0.6537 -0.0077 0.6205 3.6242
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.111e+00 1.342e+00 -3.064 0.002351 **
## pop 3.926e-06 9.930e-07 3.954 9.28e-05 ***
## pop18 6.725e-02 2.244e-02 2.998 0.002914 **
## bagrad 9.341e-02 4.693e-02 1.990 0.047307 *
## poverty 1.119e-01 2.731e-02 4.098 5.18e-05 ***
## unemp 7.105e-02 3.626e-02 1.960 0.050833 .
## pcincome 3.396e-04 6.392e-05 5.314 1.91e-07 ***
## region2 8.899e-01 1.779e-01 5.002 8.98e-07 ***
## region3 2.107e+00 1.820e-01 11.577 < 2e-16 ***
## region4 1.840e+00 2.123e-01 8.666 < 2e-16 ***
## beds_rate_1000 1.968e-01 4.746e-02 4.147 4.22e-05 ***
## density 6.492e-05 4.587e-05 1.415 0.157840
## pop:pcincome -1.595e-10 4.797e-11 -3.325 0.000978 ***
## bagrad:pcincome -5.950e-06 2.161e-06 -2.753 0.006202 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.1 on 353 degrees of freedom
## Multiple R-squared: 0.561, Adjusted R-squared: 0.5448
## F-statistic: 34.69 on 13 and 353 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(fit_int2_trans)
Diagnose plots with interaction terms:pcincome*bagrad
Compare to the diagnose plots of untransformed model, we found that the residuals are more unevenly distributed. Therefore, transformed model is worse. We select the untransformed model.
Our third model: \[crime\_rate\_1000 = pop + pop18 + bagrad + poverty + unemp + pcincome + \\ pcincome*pop + region + beds\_rate\_1000 + density + pcincome*bagrad\]
Here is the summary table of the three models:
tab_model(fit_nest, fit_int1,fit_int2,show.se = TRUE,
pred.labels = c("Intercept", "Total population", "Percent of population aged 18-34", "Bachelors proportion",
"Percent below poverty level", "Percent unemployment", "Per Capita income", "Region-North Central", "Region-South","Region-West",
"Number of hospital beds per 1000 person",
"Density","Total Population*income per capita", "Poverty*income per capita", "Bachelors proportion*income per capita"),
dv.labels = c("Model 1", "Model 2", "Model 3"),
string.pred = "Coeffcient",
string.ci = "Conf. Int (95%)",
string.p = "P-Value",
title = "Table 5: Models summary"
)
| Model 1 | Model 2 | Model 3 | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Coeffcient | Estimates | std. Error | Conf. Int (95%) | P-Value | Estimates | std. Error | Conf. Int (95%) | P-Value | Estimates | std. Error | Conf. Int (95%) | P-Value |
| Intercept | -83.18 | 15.49 | -113.63 – -52.72 | <0.001 | -41.43 | 15.68 | -72.27 – -10.59 | 0.009 | -114.11 | 19.63 | -152.72 – -75.51 | <0.001 |
| Total population | 0.00 | 0.00 | 0.00 – 0.00 | <0.001 | 0.00 | 0.00 | 0.00 – 0.00 | <0.001 | 0.00 | 0.00 | 0.00 – 0.00 | <0.001 |
| Percent of population aged 18-34 | 1.25 | 0.32 | 0.62 – 1.88 | <0.001 | 0.93 | 0.23 | 0.47 – 1.38 | <0.001 | 1.05 | 0.33 | 0.40 – 1.69 | 0.002 |
| Bachelors proportion | -0.37 | 0.25 | -0.86 – 0.13 | 0.145 | 1.25 | 0.69 | -0.10 – 2.60 | 0.069 | ||||
| Percent below poverty level | 1.66 | 0.39 | 0.90 – 2.43 | <0.001 | -2.78 | 1.09 | -4.93 – -0.62 | 0.012 | 1.91 | 0.40 | 1.12 – 2.69 | <0.001 |
| Percent unemployment | 0.75 | 0.53 | -0.30 – 1.79 | 0.160 | 0.94 | 0.48 | -0.01 – 1.90 | 0.052 | 0.90 | 0.53 | -0.14 – 1.94 | 0.091 |
| Per Capita income | 0.00 | 0.00 | 0.00 – 0.00 | <0.001 | 0.00 | 0.00 | -0.00 – 0.00 | 0.112 | 0.01 | 0.00 | 0.00 – 0.01 | <0.001 |
| Region-North Central | 12.03 | 2.62 | 6.88 – 17.18 | <0.001 | 10.79 | 2.57 | 5.73 – 15.85 | <0.001 | 12.44 | 2.60 | 7.32 – 17.56 | <0.001 |
| Region-South | 29.67 | 2.68 | 24.40 – 34.95 | <0.001 | 27.69 | 2.64 | 22.50 – 32.89 | <0.001 | 29.79 | 2.66 | 24.55 – 35.02 | <0.001 |
| Region-West | 24.63 | 3.13 | 18.47 – 30.78 | <0.001 | 21.32 | 3.03 | 15.35 – 27.28 | <0.001 | 24.32 | 3.11 | 18.21 – 30.43 | <0.001 |
| Number of hospital beds per 1000 person | 3.09 | 0.69 | 1.74 – 4.44 | <0.001 | 1.99 | 0.72 | 0.56 – 3.41 | 0.006 | 2.74 | 0.69 | 1.37 – 4.11 | <0.001 |
| Density | 0.00 | 0.00 | -0.00 – 0.00 | 0.199 | 0.00 | 0.00 | -0.00 – 0.00 | 0.134 | ||||
| Total Population*income per capita | -0.00 | 0.00 | -0.00 – -0.00 | <0.001 | -0.00 | 0.00 | -0.00 – -0.00 | <0.001 | -0.00 | 0.00 | -0.00 – -0.00 | <0.001 |
| Poverty*income per capita | 0.00 | 0.00 | 0.00 – 0.00 | <0.001 | ||||||||
| Bachelors proportion*income per capita | -0.00 | 0.00 | -0.00 – -0.00 | 0.012 | ||||||||
| Observations | 367 | 367 | 367 | |||||||||
| R2 / R2 adjusted | 0.563 / 0.548 | 0.580 / 0.567 | 0.571 / 0.555 | |||||||||
set.seed(1)
train = trainControl(method = "cv", number = 5)
model_train1 = train(crime_rate_1000 ~
pop + pop18 + bagrad +
poverty + unemp + pcincome + pcincome*pop + region +
beds_rate_1000 + density,data = cdi_model,
trControl = train,
method = 'lm',
na.action = na.pass)
print(model_train1)
## Linear Regression
##
## 367 samples
## 9 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 293, 294, 294, 294, 293
## Resampling results:
##
## RMSE Rsquared MAE
## 16.60425 0.5343078 12.52969
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
set.seed(1)
train = trainControl(method = "cv", number = 5)
model_train2 = train(crime_rate_1000 ~
pop + pop18 +
poverty + unemp + pcincome + pcincome*pop + region +
beds_rate_1000 +
poverty*pcincome, data = cdi_model,
trControl = train,
method = 'lm',
na.action = na.pass)
summary(model_train2)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.933 -8.984 -0.825 9.062 64.974
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.143e+01 1.568e+01 -2.642 0.008603 **
## pop 6.001e-05 1.379e-05 4.352 1.76e-05 ***
## pop18 9.277e-01 2.321e-01 3.996 7.84e-05 ***
## poverty -2.777e+00 1.095e+00 -2.536 0.011647 *
## unemp 9.432e-01 4.847e-01 1.946 0.052468 .
## pcincome 9.798e-04 6.151e-04 1.593 0.112068
## region2 1.079e+01 2.573e+00 4.194 3.47e-05 ***
## region3 2.769e+01 2.641e+00 10.485 < 2e-16 ***
## region4 2.132e+01 3.033e+00 7.028 1.08e-11 ***
## beds_rate_1000 1.988e+00 7.238e-01 2.746 0.006339 **
## `pop:pcincome` -2.457e-09 6.640e-10 -3.700 0.000249 ***
## `poverty:pcincome` 2.957e-04 6.766e-05 4.371 1.63e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.87 on 355 degrees of freedom
## Multiple R-squared: 0.5802, Adjusted R-squared: 0.5672
## F-statistic: 44.61 on 11 and 355 DF, p-value: < 2.2e-16
set.seed(1)
train = trainControl(method = "cv", number = 5)
model_train3 = train(crime_rate_1000 ~
pop + pop18 + bagrad +
poverty + unemp + pcincome + pcincome*pop + region +
beds_rate_1000 + density +
pcincome*bagrad, data = cdi_model,
trControl = train,
method = 'lm',
na.action = na.pass)
summary(model_train3)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -42.88 -9.84 -0.85 8.19 61.74
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.141e+02 1.963e+01 -5.814 1.37e-08 ***
## pop 6.518e-05 1.453e-05 4.487 9.79e-06 ***
## pop18 1.048e+00 3.282e-01 3.193 0.001537 **
## bagrad 1.252e+00 6.865e-01 1.824 0.069046 .
## poverty 1.906e+00 3.995e-01 4.770 2.69e-06 ***
## unemp 9.002e-01 5.304e-01 1.697 0.090555 .
## pcincome 5.012e-03 9.351e-04 5.360 1.51e-07 ***
## region2 1.244e+01 2.603e+00 4.778 2.60e-06 ***
## region3 2.979e+01 2.662e+00 11.191 < 2e-16 ***
## region4 2.432e+01 3.107e+00 7.827 5.87e-14 ***
## beds_rate_1000 2.740e+00 6.944e-01 3.946 9.58e-05 ***
## density 1.007e-03 6.710e-04 1.500 0.134491
## `pop:pcincome` -2.706e-09 7.018e-10 -3.856 0.000137 ***
## `bagrad:pcincome` -8.009e-05 3.161e-05 -2.534 0.011723 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.09 on 353 degrees of freedom
## Multiple R-squared: 0.5708, Adjusted R-squared: 0.555
## F-statistic: 36.11 on 13 and 353 DF, p-value: < 2.2e-16
model <- c("1", "2", "3")
RMSE <- c(round(model_train1$results$RMSE, 2), round(model_train2$results$RMSE,2),
round(model_train3$results$RMSE, 2))
R_sq <- c(round(model_train1$results$Rsquared, 3),
round(model_train2$results$Rsquared, 3),
round(model_train3$results$Rsquared, 3))
RMSE_table <- data.frame(model, RMSE, R_sq)
coefs_1 = model_train1$finalModel$coefficients
names_1 = model_train1$finalModel$xNames
knitr::kable(RMSE_table, caption = "RMSE table for three models")
| model | RMSE | R_sq |
|---|---|---|
| 1 | 16.60 | 0.534 |
| 2 | 16.20 | 0.561 |
| 3 | 16.46 | 0.542 |
The second model has the lowest RMSE.
test_data = test_data %>%
mutate(
y = crime_rate_1000,
y_model_1 = predict(model_train1,test_data),
y_model_2 = predict(model_train2,test_data),
y_model_3 = predict(model_train3,test_data))
RMSPE_1 = sqrt(mean((test_data$y-test_data$y_model_1)^2))
RMSPE_2 = sqrt(mean((test_data$y-test_data$y_model_2)^2))
RMSPE_3 = sqrt(mean((test_data$y-test_data$y_model_3)^2))
model_assessment =
tibble(
RMSPE_1 = round(RMSPE_1,2),
RMSPE_2 = round(RMSPE_2,2),
RMSPE_3 = round(RMSPE_3,2)) %>%
pivot_longer(RMSPE_1:RMSPE_3,
names_to = "model",
names_prefix = "RMSPE_",
values_to = "RMSPE") %>%
left_join(RMSE_table,by="model") %>%
dplyr::select(Model=model,R_square = R_sq,RMSE,RMSPE)
knitr::kable(model_assessment, caption = "Model assessment table", booktabs = TRUE)
| Model | R_square | RMSE | RMSPE |
|---|---|---|---|
| 1 | 0.534 | 16.60 | 12.06 |
| 2 | 0.561 | 16.20 | 12.32 |
| 3 | 0.542 | 16.46 | 11.90 |